Análisis de tweets en Colombia

import pandas as pd
import numpy as np
import plotly.express as px
import re
import spacy
import preprocessor as p
from emoji import demojize
from bertopic import BERTopic
# IPython shell escape: install the small Spanish spaCy pipeline into the
# active environment (this runs each time the cell is executed).
!python -m spacy download es_core_news_sm
# load the spacy model for Spanish
nlp = spacy.load("es_core_news_sm")
Collecting es-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.5.0/es_core_news_sm-3.5.0-py3-none-any.whl (12.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.9/12.9 MB 20.4 MB/s eta 0:00:00m eta 0:00:010:01:01
Requirement already satisfied: spacy<3.6.0,>=3.5.0 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from es-core-news-sm==3.5.0) (3.5.4)
Requirement already satisfied: setuptools in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (68.0.0)
Requirement already satisfied: jinja2 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (3.1.2)
Requirement already satisfied: packaging>=20.0 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (23.1)
Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (1.10.9)
Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (6.3.0)
Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (3.0.12)
Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (1.1.2)
Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (4.65.0)
Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (3.0.8)
Requirement already satisfied: typer<0.10.0,>=0.3.0 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (0.9.0)
Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (2.0.7)
Requirement already satisfied: thinc<8.2.0,>=8.1.8 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (8.1.10)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (2.31.0)
Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (1.0.4)
Requirement already satisfied: pathy>=0.10.0 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (0.10.2)
Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (1.0.9)
Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (2.0.8)
Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (3.3.0)
Requirement already satisfied: numpy>=1.15.0 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (1.24.4)
Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (2.4.6)
Requirement already satisfied: typing-extensions>=4.2.0 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (4.7.0)
Requirement already satisfied: certifi>=2017.4.17 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (2023.5.7)
Requirement already satisfied: urllib3<3,>=1.21.1 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (1.26.16)
Requirement already satisfied: idna<4,>=2.5 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (3.4)
Requirement already satisfied: charset-normalizer<4,>=2 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (3.1.0)
Requirement already satisfied: confection<1.0.0,>=0.0.1 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (0.1.0)
Requirement already satisfied: blis<0.8.0,>=0.7.8 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from thinc<8.2.0,>=8.1.8->spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (0.7.9)
Requirement already satisfied: click<9.0.0,>=7.1.1 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from typer<0.10.0,>=0.3.0->spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (8.1.3)
Requirement already satisfied: MarkupSafe>=2.0 in /home/fxr/.local/share/virtualenvs/capir_transfronteriza2_2023-f1a4fPBO/lib/python3.8/site-packages (from jinja2->spacy<3.6.0,>=3.5.0->es-core-news-sm==3.5.0) (2.1.3)

[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: pip install --upgrade pip
✔ Download and installation successful
You can now load the package via spacy.load('es_core_news_sm')
# Load the processed tweet export.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; the chunked default produced a DtypeWarning
# ("Columns ... have mixed types") for this CSV.
df = pd.read_csv('../data/processed/tweets.csv', low_memory=False)
# number of tweets loaded
print(len(df))
/tmp/ipykernel_186250/1855968218.py:1: DtypeWarning: Columns (6,10,18,19,20,26,28,38,39,40,45,46,47,48,54) have mixed types. Specify dtype option on import or set low_memory=False.
  df = pd.read_csv('../data/processed/tweets.csv')
200827
# split the 'local_time' string (e.g. '2023-03-21T13:45:09') at the 'T'
# separator into two new columns: 'date' and 'time'
df[['date', 'time']] = df['local_time'].str.split('T', expand=True)
# show the first row to check the new columns
df.head(1)
query id timestamp_utc local_time user_screen_name text possibly_sensitive retweet_count like_count reply_count ... media_alt_texts mentioned_names mentioned_ids hashtags intervention_type intervention_text intervention_url country date time
0 from:TommyZambranoM 1.638175e+18 1679406309 2023-03-21T13:45:09 TommyZambranoM Los Nacionalistas para lograr la renovación de... 0.0 30.0 117.0 58.0 ... NaN pnh_oficial 201589327 librenuncamas NaN NaN NaN Honduras 2023-03-21 13:45:09

1 rows × 64 columns

# convert 'date' column to datetime format
# errors='coerce': values that do not match '%Y-%m-%d' become NaT instead of raising
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d', errors='coerce')
# (rows, columns) of the full dataset, followed by a one-row preview
print(df.shape)
df.head(1)
(200827, 64)
query id timestamp_utc local_time user_screen_name text possibly_sensitive retweet_count like_count reply_count ... media_alt_texts mentioned_names mentioned_ids hashtags intervention_type intervention_text intervention_url country date time
0 from:TommyZambranoM 1.638175e+18 1679406309 2023-03-21T13:45:09 TommyZambranoM Los Nacionalistas para lograr la renovación de... 0.0 30.0 117.0 58.0 ... NaN pnh_oficial 201589327 librenuncamas NaN NaN NaN Honduras 2023-03-21 13:45:09

1 rows × 64 columns

# Keep only the tweets posted by the @UnidosxlaVidaCo account.
# .copy() gives an independent DataFrame: later cells add columns to it
# ('preprocess', 'hour', 'text_clean'), and assigning to a boolean-mask slice
# of `df` would otherwise raise SettingWithCopyWarning.
UnidosxlaVidaCo = df[df['user_screen_name'] == 'UnidosxlaVidaCo'].copy()
print(UnidosxlaVidaCo.shape)
# Chronological view for display only; the sorted result is not stored.
UnidosxlaVidaCo.sort_values(by='date', ascending=True)
(7830, 64)
query id timestamp_utc local_time user_screen_name text possibly_sensitive retweet_count like_count reply_count ... media_alt_texts mentioned_names mentioned_ids hashtags intervention_type intervention_text intervention_url country date time
179249 from:UnidosxlaVidaCo 8.081429e+16 1308102603 2011-06-15T01:50:03 UnidosxlaVidaCo Unidos por la Vida Colombia se estrena en Twit... NaN 0.0 0.0 0.0 ... NaN NaN NaN NaN NaN NaN NaN Colombia 2011-06-15 01:50:03
179248 from:UnidosxlaVidaCo 8.133618e+16 1308227032 2011-06-16T12:23:52 UnidosxlaVidaCo Sabías que estamos promoviendo un Proyecto de ... NaN 3.0 0.0 1.0 ... NaN NaN NaN NaN NaN NaN NaN Colombia 2011-06-16 12:23:52
179247 from:UnidosxlaVidaCo 8.133723e+16 1308227280 2011-06-16T12:28:00 UnidosxlaVidaCo Acto Legislativo para defender la vida http://... 0.0 0.0 0.0 0.0 ... NaN NaN NaN NaN NaN NaN NaN Colombia 2011-06-16 12:28:00
179246 from:UnidosxlaVidaCo 8.280595e+16 1308577452 2011-06-20T13:44:12 UnidosxlaVidaCo Festival de la Vida en la JMJ Madrid 2011 http... 0.0 1.0 0.0 0.0 ... NaN NaN NaN NaN NaN NaN NaN Colombia 2011-06-20 13:44:12
179245 from:UnidosxlaVidaCo 8.574863e+16 1309279040 2011-06-28T16:37:20 UnidosxlaVidaCo Entra a nuestra página http://www.unidosporlav... 0.0 1.0 0.0 0.0 ... NaN NaN NaN NaN NaN NaN NaN Colombia 2011-06-28 16:37:20
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
171424 from:UnidosxlaVidaCo 1.630978e+18 1677690470 2023-03-01T17:07:50 UnidosxlaVidaCo No hubo aborto y aún así, Insisten en que en e... 0.0 21.0 28.0 1.0 ... NaN NaN NaN quienesbeatriz NaN NaN NaN Colombia 2023-03-01 17:07:50
171423 from:UnidosxlaVidaCo 1.630979e+18 1677690574 2023-03-01T17:09:34 UnidosxlaVidaCo La verdad es que ella falleció por causas leja... 0.0 6.0 8.0 0.0 ... NaN corteidh 190706828 quienedbeatriz NaN NaN NaN Colombia 2023-03-01 17:09:34
171422 from:UnidosxlaVidaCo 1.630981e+18 1677691259 2023-03-01T17:20:59 UnidosxlaVidaCo #QuienEsBeatriz « LideresXlaVida: Beatriz Vs. ... NaN 6.0 6.0 0.0 ... NaN NaN NaN quienesbeatriz NaN NaN NaN Colombia 2023-03-01 17:20:59
171421 from:UnidosxlaVidaCo 1.630982e+18 1677691337 2023-03-01T17:22:17 UnidosxlaVidaCo Una nueva intervención de la @CorteIDH para im... NaN 6.0 4.0 0.0 ... NaN corteidh 190706828 quienesbeatriz NaN NaN NaN Colombia 2023-03-01 17:22:17
171420 from:UnidosxlaVidaCo 1.630982e+18 1677691376 2023-03-01T17:22:56 UnidosxlaVidaCo #QuienEsBeatriz y porque la @CorteIDH quiere c... NaN 8.0 7.0 0.0 ... NaN corteidh 190706828 quienesbeatriz NaN NaN NaN Colombia 2023-03-01 17:22:56

7830 rows × 64 columns

# frequency of each distinct value in the 'domains' column
domains_list = UnidosxlaVidaCo['domains'].value_counts()
# the 20 most frequent domains
top_domains = domains_list.nlargest(20)
top_domains
domains
fb.me                     1231
bit.ly                     242
unidosporlavida.com        193
facebook.com               171
instagram.com              125
sumall.com                  98
youtube.com                 68
lifenews.com                40
citizengo.org               36
youtu.be                    33
20ft.net                    33
aciprensa.com               19
votocatolico.co             18
actuall.com                 15
shar.es                     15
liveactionnews.org          15
twitter.com                 12
es.gaudiumpress.org         12
religionenlibertad.com      10
razonmasfe.com               8
Name: count, dtype: int64
# Build one flat list with every hashtag the account used: tweets without
# hashtags (NaN) are skipped and each '|'-separated entry is broken into its
# individual tags.
hashtags = [
    tag
    for entry in UnidosxlaVidaCo['hashtags'].to_list()
    if not pd.isna(entry)
    for tag in entry.split('|')
]
# frequency of each tag, then the 20 most used
hashtags_count = pd.Series(hashtags).value_counts()
top_hashtags = hashtags_count.nlargest(20)
top_hashtags
sialavida                647
aborto                   416
9marchaxlavida           373
noalaborto               325
colombiaesprovida        295
eutanasia                157
procuradorordóñez        139
sialprocurador           138
yosoyprovida             135
soyprovida               108
negocio                  106
repost                   100
todavidaimporta          100
elijolas2vidas            98
colombia                  93
eutanasiano               91
abortocero                91
fiestaxlavida             91
4mayo7marchaporlavida     89
caravanaporlavida         88
Name: count, dtype: int64
# Build one flat list with every account mentioned in the tweets: rows with
# no mentions (NaN) are dropped and '|'-separated entries are expanded into
# single screen names.
users = [
    name
    for mentioned in UnidosxlaVidaCo['mentioned_names'].to_list()
    if not pd.isna(mentioned)
    for name in mentioned.split('|')
]
# frequency of each mentioned account, then the 10 most mentioned
users_count = pd.Series(users).value_counts()
top_users = users_count.nlargest(10)
top_users
marceposada        196
colombiaprovida    194
cconstitucional    176
monicaroa          173
sialprocurador     106
unidosxlavidaco    105
noticiasrcn         83
7marcofidelr        62
amadarosa           59
referendoxvida      51
Name: count, dtype: int64
# Line chart of the like count of each tweet against its date; the tweet
# text is added to the hover tooltip.
line_options = dict(
    x='date',
    y='like_count',
    title='Likes over Time',
    template='plotly_dark',
    hover_data=['text'],
)
fig = px.line(UnidosxlaVidaCo, **line_options)

# render the figure in the notebook
fig.show()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
# Spanish stop-word set shipped with the loaded spaCy pipeline.
STOP_WORDS = nlp.Defaults.stop_words


def filter_stopwords(text):
    """Return *text* lower-cased with stop words and non-alphabetic tokens removed.

    Args:
        text: Raw tweet text. Non-string values (e.g. the NaN pandas uses for
            missing text) are treated as empty.

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        nothing is left or when *text* is not a string.
    """
    # A missing tweet text arrives as float NaN, which has no .lower().
    if not isinstance(text, str):
        return ''
    # Only tokenizer-level attributes (text, is_stop, is_alpha) are read, so
    # tokenizing is enough; the tagger/parser/NER components are skipped.
    doc = nlp.make_doc(text.lower())
    tokens = [
        token.text
        for token in doc
        if token.is_alpha and not token.is_stop and token.text not in STOP_WORDS
    ]
    return ' '.join(tokens)

# Store a stop-word-free version of every tweet in the 'preprocess' column.
UnidosxlaVidaCo['preprocess'] = UnidosxlaVidaCo['text'].apply(filter_stopwords)

# Split each cleaned tweet on whitespace (one token per cell), stack the cells
# into a single Series and keep the 20 most frequent tokens.
token_matrix = UnidosxlaVidaCo["preprocess"].str.split(expand=True)
all_tokens = token_matrix.stack()
token_counts = all_tokens.value_counts().head(20)

token_counts
vida                 2070
aborto               1097
colombia              719
sialavida             661
colombiaesprovida     437
mayo                  390
q                     388
noalaborto            370
eutanasia             323
derecho               323
gracias               309
provida               308
muerte                268
feliz                 268
d                     263
voz                   250
mujer                 222
familia               210
mujeres               204
concepción            191
Name: count, dtype: int64
# The hour of day is the first ':'-separated field of the 'time' string.
time_fields = UnidosxlaVidaCo['time'].str.split(":")
UnidosxlaVidaCo['hour'] = time_fields.str.get(0)
# tweets per hour, then the 15 busiest hours
hours_count = UnidosxlaVidaCo['hour'].value_counts()
top_hours = hours_count.nlargest(15)
top_hours
hour
16    786
15    737
17    677
14    622
19    525
18    519
13    519
12    448
00    426
20    403
22    403
21    348
23    348
02    257
01    253
Name: count, dtype: int64
# number of tweets per value of 'source_name' (the client the tweet was posted from)
UnidosxlaVidaCo['source_name'].value_counts()
source_name
Twitter for iPhone             2031
Twitter Web App                1706
Twitter Web Client             1487
Facebook                       1468
Twitter for Android             412
Mobile Web                      163
TweetDeck                       133
erased88075                     131
Twitter for Websites            124
Instagram                        99
UberSocial for iPhone            22
Mobile Web (M2)                  12
iOS                              11
Twitter for Android Tablets      10
Twitter for Mac                   7
Tweeet! on iOS                    4
Hootsuite Inc.                    3
Buffer                            3
Hootsuite                         2
Twibbon                           1
Periscope                         1
Name: count, dtype: int64

Topics

# Remove urls
# (set_options changes the preprocessor's global configuration, so the order
# of the set_options / clean / tokenize calls below matters)
p.set_options(p.OPT.URL)
UnidosxlaVidaCo['text_clean'] = UnidosxlaVidaCo['text'].apply(lambda x: p.clean(x))

# Tokenize mentions
# NOTE(review): this comment used to say "mentions and hashtags", but only
# OPT.MENTION is enabled; presumably hashtags are left untouched. Add
# p.OPT.HASHTAG if they should be tokenized too.
p.set_options(p.OPT.MENTION)
UnidosxlaVidaCo['text_clean'] = UnidosxlaVidaCo['text_clean'].apply(lambda x: p.tokenize(x))

# Replace emojis with descriptions
UnidosxlaVidaCo['text_clean'] = UnidosxlaVidaCo['text_clean'].apply(lambda x: demojize(x))
# Documents used to fit the topic model: the stop-word-filtered 'preprocess'
# column, not the 'text_clean' column built above.
# NOTE(review): confirm which of the two columns is meant to be modelled.
docs = UnidosxlaVidaCo['preprocess']
# Multilingual BERTopic model; also returns per-document topic probabilities.
topic_model = BERTopic(language="multilingual", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(docs)
2023-06-29 16:20:59,948 - BERTopic - Transformed documents to Embeddings
2023-06-29 16:21:08,856 - BERTopic - Reduced dimensionality
2023-06-29 16:21:13,178 - BERTopic - Clustered reduced embeddings
# Summary table with one row per topic (topic id, document count, name,
# representation, representative documents); preview the first five rows.
freq = topic_model.get_topic_info()
freq.head(5)
Topic Count Name Representation Representative_Docs
0 -1 2501 -1_aborto_movida_sialavida_vida [aborto, movida, sialavida, vida, colombia, oe... [vida madre bebé nacer valiosas derechos provi...
1 0 363 0_totalmente_mmmm_desconoce_ [totalmente, mmmm, desconoce, , , , , , , ] [mmmm, totalmente, totalmente]
2 1 130 1_colombia_aborto_demanda_despenalización [colombia, aborto, demanda, despenalización, c... [aborto colombia súmate comparte noqueremosabo...
3 2 122 2_mujeres_mujer_mujeresvida_diainternacionalde... [mujeres, mujer, mujeresvida, diainternacional... [mujer imprescindible progreso país gobiernos ...
4 3 119 3_referendo_vida_vidaesesperanza_ama [referendo, vida, vidaesesperanza, ama, todavi... [faltan referendo vida uxv, faltan referendo v...
# Plotly visualization of the fitted topics (before any topic reduction)
topic_model.visualize_topics()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
# Recompute the topic representations with an n-gram range of 1 to 2 words.
topic_model.update_topics(docs, n_gram_range=(1, 2))
# Reduce the model to 11 topics (the run log reports a reduction from 147).
topic_model.reduce_topics(docs, nr_topics=11)
2023-06-29 16:21:23,924 - BERTopic - Reduced number of topics from 147 to 11
<bertopic._bertopic.BERTopic at 0x7febe1c94dc0>
# Same visualization again, now for the reduced set of topics
topic_model.visualize_topics()
Unable to display output for mime type(s): application/vnd.plotly.v1+json
# Topic evolution over time.
# The documents passed here must be the ones the model was fitted on (the
# 'preprocess' column), so that each document lines up with its assigned topic
# and is vectorized consistently with the fitted vocabulary. The previous
# version passed 'text_clean', which the model never saw.
tweets = UnidosxlaVidaCo['preprocess'].to_list()
# ISO-8601 timestamp strings, one per tweet, in the same row order as `tweets`.
timestamps = UnidosxlaVidaCo['local_time'].to_list()

# Group the timestamps into 20 bins and compute a topic representation per bin.
topics_over_time = topic_model.topics_over_time(docs=tweets,
                                                timestamps=timestamps,
                                                global_tuning=True,
                                                evolution_tuning=True,
                                                nr_bins=20)

topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)
20it [00:01, 11.21it/s]
Unable to display output for mime type(s): application/vnd.plotly.v1+json